Date: Comparing number of incidents by year, quarter and month
For this first part I want to explore the relationship between time and number of incidents. Based on the analysis by year, quarter and month to see when the most incident occurs and see whether it’s increasing or decreasing.
# load packages
library(tidyr)
library( lubridate )
#inspect data
str(Gun_violence)
'data.frame': 239677 obs. of 22 variables:
$ incident_id : int 461105 460726 478855 478925 478959 478948 479363 479374 479389 492151 ...
$ date : chr "1/1/13" "1/1/13" "1/1/13" "1/5/13" ...
$ state : chr "Pennsylvania" "California" "Ohio" "Colorado" ...
$ city_or_county : chr "Mckeesport" "Hawthorne" "Lorain" "Aurora" ...
$ address : chr "1506 Versailles Avenue and Coursin Street" "13500 block of Cerise Avenue" "1776 East 28th Street" "16000 block of East Ithaca Place" ...
$ n_killed : int 0 1 1 4 2 4 5 0 0 1 ...
$ n_injured : int 4 3 3 0 2 0 0 5 4 6 ...
$ congressional_district : int 14 43 9 6 6 1 1 2 9 7 ...
$ gun_stolen : chr "" "" "0::Unknown||1::Unknown" "" ...
$ gun_type : chr "" "" "0::Unknown||1::Unknown" "" ...
$ incident_characteristics: chr "Shot - Wounded/Injured||Mass Shooting (4+ victims injured or killed excluding the subject/suspect/perpetrator, "| __truncated__ "Shot - Wounded/Injured||Shot - Dead (murder, accidental, suicide)||Mass Shooting (4+ victims injured or killed "| __truncated__ "Shot - Wounded/Injured||Shot - Dead (murder, accidental, suicide)||Shots Fired - No Injuries||Bar/club incident"| __truncated__ "Shot - Dead (murder, accidental, suicide)||Officer Involved Incident||Officer Involved Shooting - subject/suspe"| __truncated__ ...
$ location_description : chr "" "" "Cotton Club" "" ...
$ n_guns_involved : int NA NA 2 NA 2 NA 2 NA NA NA ...
$ notes : chr "Julian Sims under investigation: Four Shot and Injured" "Four Shot; One Killed; Unidentified shooter in getaway car" "" "" ...
$ participant_age : chr "0::20" "0::20" "0::25||1::31||2::33||3::34||4::33" "0::29||1::33||2::56||3::33" ...
$ participant_age_group : chr "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+||4::Adult 18+" "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+" "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+||4::Adult 18+" "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+" ...
$ participant_gender : chr "0::Male||1::Male||3::Male||4::Female" "0::Male" "0::Male||1::Male||2::Male||3::Male||4::Male" "0::Female||1::Male||2::Male||3::Male" ...
$ participant_relationship: chr "" "" "" "" ...
$ participant_status : chr "0::Arrested||1::Injured||2::Injured||3::Injured||4::Injured" "0::Killed||1::Injured||2::Injured||3::Injured" "0::Injured, Unharmed, Arrested||1::Unharmed, Arrested||2::Killed||3::Injured||4::Injured" "0::Killed||1::Killed||2::Killed||3::Killed" ...
$ participant_type : chr "0::Victim||1::Victim||2::Victim||3::Victim||4::Subject-Suspect" "0::Victim||1::Victim||2::Victim||3::Victim||4::Subject-Suspect" "0::Subject-Suspect||1::Subject-Suspect||2::Victim||3::Victim||4::Victim" "0::Victim||1::Victim||2::Victim||3::Subject-Suspect" ...
$ state_house_district : int NA 62 56 40 62 72 10 93 11 NA ...
$ state_senate_district : int NA 35 13 28 27 11 14 5 7 44 ...
# modify table: reform the date
Gun_violence_new <-
Gun_violence %>%
mutate(date = mdy(date)) %>% # make the date variable tidier
mutate(year = lubridate::year(date),month = lubridate::month(date),day = lubridate::day(date)) # split the date variable to three different variable: year, month, date
#explore modified data table
Gun_violence_new
str(Gun_violence_new)
'data.frame': 239677 obs. of 25 variables:
$ incident_id : int 461105 460726 478855 478925 478959 478948 479363 479374 479389 492151 ...
$ date : Date, format: "2013-01-01" "2013-01-01" "2013-01-01" "2013-01-05" ...
$ state : chr "Pennsylvania" "California" "Ohio" "Colorado" ...
$ city_or_county : chr "Mckeesport" "Hawthorne" "Lorain" "Aurora" ...
$ address : chr "1506 Versailles Avenue and Coursin Street" "13500 block of Cerise Avenue" "1776 East 28th Street" "16000 block of East Ithaca Place" ...
$ n_killed : int 0 1 1 4 2 4 5 0 0 1 ...
$ n_injured : int 4 3 3 0 2 0 0 5 4 6 ...
$ congressional_district : int 14 43 9 6 6 1 1 2 9 7 ...
$ gun_stolen : chr "" "" "0::Unknown||1::Unknown" "" ...
$ gun_type : chr "" "" "0::Unknown||1::Unknown" "" ...
$ incident_characteristics: chr "Shot - Wounded/Injured||Mass Shooting (4+ victims injured or killed excluding the subject/suspect/perpetrator, "| __truncated__ "Shot - Wounded/Injured||Shot - Dead (murder, accidental, suicide)||Mass Shooting (4+ victims injured or killed "| __truncated__ "Shot - Wounded/Injured||Shot - Dead (murder, accidental, suicide)||Shots Fired - No Injuries||Bar/club incident"| __truncated__ "Shot - Dead (murder, accidental, suicide)||Officer Involved Incident||Officer Involved Shooting - subject/suspe"| __truncated__ ...
$ location_description : chr "" "" "Cotton Club" "" ...
$ n_guns_involved : int NA NA 2 NA 2 NA 2 NA NA NA ...
$ notes : chr "Julian Sims under investigation: Four Shot and Injured" "Four Shot; One Killed; Unidentified shooter in getaway car" "" "" ...
$ participant_age : chr "0::20" "0::20" "0::25||1::31||2::33||3::34||4::33" "0::29||1::33||2::56||3::33" ...
$ participant_age_group : chr "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+||4::Adult 18+" "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+" "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+||4::Adult 18+" "0::Adult 18+||1::Adult 18+||2::Adult 18+||3::Adult 18+" ...
$ participant_gender : chr "0::Male||1::Male||3::Male||4::Female" "0::Male" "0::Male||1::Male||2::Male||3::Male||4::Male" "0::Female||1::Male||2::Male||3::Male" ...
$ participant_relationship: chr "" "" "" "" ...
$ participant_status : chr "0::Arrested||1::Injured||2::Injured||3::Injured||4::Injured" "0::Killed||1::Injured||2::Injured||3::Injured" "0::Injured, Unharmed, Arrested||1::Unharmed, Arrested||2::Killed||3::Injured||4::Injured" "0::Killed||1::Killed||2::Killed||3::Killed" ...
$ participant_type : chr "0::Victim||1::Victim||2::Victim||3::Victim||4::Subject-Suspect" "0::Victim||1::Victim||2::Victim||3::Victim||4::Subject-Suspect" "0::Subject-Suspect||1::Subject-Suspect||2::Victim||3::Victim||4::Victim" "0::Victim||1::Victim||2::Victim||3::Subject-Suspect" ...
$ state_house_district : int NA 62 56 40 62 72 10 93 11 NA ...
$ state_senate_district : int NA 35 13 28 27 11 14 5 7 44 ...
$ year : num 2013 2013 2013 2013 2013 ...
$ month : num 1 1 1 1 1 1 1 1 1 1 ...
$ day : int 1 1 1 5 7 7 19 21 21 23 ...
Yearly incidents
# create yearly incidents table of the U.S. from 2014-2017
yearly <-
Gun_violence_new %>%
filter(year!= 2013, year != 2018) %>% # exclude 2013 and 2018 since the data are incomplete
group_by(year) %>%
summarise(yearly= n()) %>%
mutate(year_average = mean(yearly)) # compute new variable of average number of incidents of 4 years
`summarise()` ungrouping output (override with `.groups` argument)
yearly
NA
Yearly bar charts
# plot yearly graph with average line
ggplot(yearly) +
geom_col(aes(x = year, y = yearly), size = 1, color = "darkblue", fill = "grey") +
geom_line(aes(x = year, y = year_average), size = 1.5, color="red", group = 1) +
scale_y_continuous(sec.axis = sec_axis(~./3, name = "year_average"))+geom_text(aes(x = year, y = yearly, label = yearly), vjust = -0.2)+theme_linedraw()

It seems as if most incidents in 2013 were not recorded. And 2018 only has first quarter recorded. So I only used the rest four years. And is obvious that number of incidents are increasing as year passing.
Quarterly incidents
# load package
library(ggplot2)
# Get quarter as new variable
Gun_violence_new$quarter <- quarter(Gun_violence_new$date)
# create yearly incidents table of the U.S. from 2014-2017
quarterly <-
Gun_violence_new %>%
filter(year!=2013) %>% #exclude 2013 since the data is incomplete
select(year, quarter) %>%
group_by(year) %>%
count(quarter) %>%
# plot quarterly bargraph and set each year as a facet
ggplot(aes(x=as.factor(quarter), y=n)) + geom_bar(stat='identity', fill = "violet") + scale_y_continuous(labels=comma) + facet_grid(.~year) + labs(x='Quarter', y='Number of incidents')+geom_text(aes(label = n), vjust = 1.5)+theme_linedraw() + facet_wrap(.~year,ncol=2)
quarterly

Same reason as the yearly graph, I excluded 2013, In addition, 2018 is not a full year of data as the latest recorded incident was on May 31st 2018. As this means exactly one quarter, I am interested if the trend is still upward if I only look at the Q1s of 2014-2018. I will look into this in the next section. I separate each year to it’s own grid to make the visualization better. And in the quarterly graph, there seems to be some sort of ‘seasonality’ with Q1 and Q4 generally having lower numbers of incidents than Q2 and Q3.
#compare quarter 1 for each year
first_quarter <-
Gun_violence_new %>%
filter(year!=2013 & quarter==1) %>% # since 2018 only have quarter 1 recorded and 2013 does not have data
select(year, quarter) %>%
group_by(year) %>%
count(quarter) %>%
# plot first quarter bargraph
ggplot(aes(x=as.factor(year), y=n)) + geom_bar(stat='identity',fill = "darkblue") + scale_y_continuous(labels=comma) + labs(x='Incidents in Q1 of each year', y='Number of incidents') + geom_text(aes(label = n), vjust = -0.2) + theme_linedraw()
first_quarter

Since 2018 only have quarter 1 recorded, I try to compare only the first quarter for each year. The second graph shows that Q1 2018 holds less incidents than Q1 2017. This could be seen as a cautiously positive sign. However, one should realize that this number is still very high when compared to other countries (relatively).
Monthly incidents
# load package
library(scales)
#create monthly table for each year
monthly1<-
Gun_violence_new %>%
filter(year!= 2013) %>%
group_by(year,month) %>%
summarise(monthly= n())
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# get monthly average for each month
monthly2<-
Gun_violence_new %>%
filter(year!= 2013, year != 2018) %>%
group_by(month) %>%
summarise(months= n()) %>%
mutate(month_average = months/4) # compute new variable of average number of incidents of each month
`summarise()` ungrouping output (override with `.groups` argument)
# left join two tables together
monthly<-
monthly1 %>%
left_join(monthly2, by = c("month"= "month"))
monthly
Monthly bar charts
# plot monthly graph with average line for each month
monthly%>%
ggplot(aes(x=month, y=monthly)) + geom_bar(stat='identity',fill="darkseagreen3") + facet_wrap(.~year,ncol=3) + labs(x='monthly', y='Number of incidents')+scale_x_continuous(breaks = pretty_breaks())+theme_linedraw() +geom_line(aes(x = month, y = month_average), size = 1, color="coral4", group = 1)

The analysis of quarters shows that more incidents occur in the warmer spring and summer periods. This seems worth diving into a little deeper.
The most visible ‘seasonality’ effect seems to me that the colder months seem to have less incidents. November, December, and February are the 3 months with the lowest number of incidents (February also only has 28 days of course). The exception seems to be January, which is worth investigating later on. My first idea is that possibly incidents on new years eve contribute to January having a high number of incidents.The other peak is the July/August period. I think that the fact that many people go on holidays in this period is the most likely explanation.
And even we observe that 2018 first quarter is slightly less than 2017, we can still see that all three month are still high than the monthly average, so we still need to take this seriously.
Location: Comparing number of incidents by state and city.
In the second part of observation, I want to know the relationship between the location and incidents to see where most incidents happened and try to conclude some possible reasons.
Top 10 states with most incidents
# the top 10 states with the most gun-related violence during the period
top10State <-
Gun_violence %>%
group_by(state) %>%
summarise(case_count= n()) %>%
arrange(desc(case_count)) %>% # make data in descending order
head(10)
`summarise()` ungrouping output (override with `.groups` argument)
top10State
top10State bar charts
#
ggplot(data=top10State, aes(x=state, y=case_count)) + geom_bar(stat="identity",fill="darksalmon") + geom_text(aes(label = case_count), vjust = -0.2)

For this first graph, I just simply compute the first 10 states with most incidents between 2013 and 2018. And we can see that Pennsylvania is also in the top 10. However, these numbers should be related to the numbers of inhabitants to get a good view of the relative numbers of incidents. For instance, California is a state with a very large population. Therefore, it is no surprise to see California as the number two in absolute numbers.
#Incidents of each State in descending order
topState <-
Gun_violence %>%
group_by(state) %>%
summarise(case_count= n()) %>%
arrange(desc(case_count))
`summarise()` ungrouping output (override with `.groups` argument)
#Incidents relative to the State population size
crimedensity<-
topState %>%
left_join(State_ZipGeography, by = c("state"= "State")) %>%
mutate(per_million=case_count/(population/1000000)) # create a new variable that show incidents per million inhabintants
crimedensity
Incident desdity bar charts
# Plot bar charts of incidents density graph
crimedensity %>%
filter(state!="District of Columbia")%>% # filter out District of Columbia since it's a outlier
ggplot(aes(y=reorder(state, per_million), x=per_million, fill=per_million, text=state,width=.7))+geom_bar(stat='identity')+theme(legend.position="none")+labs(x='Incidents per million people', y='State')+theme_linedraw()+ scale_fill_gradient(low="lightgreen", high="seagreen4")

As I indicated earlier that incident numbers should be related to the numbers of inhabitants, so for this graph, I join the two table so we can compare incident number with population. Note: actually the District of Columbia came out as the state with the highest incidents rate. But it’s not really a state and it creates a outlier, so I choose to eliminate it for this graph.
In the figure above, you can see that the enriched data, which are related to the population of each state, paint a very different picture. As the numbers of incidents are related to the population sizes, these numbers now represent ‘real’ gun danger levels. To show this visually, I have used color codes. Dark green means a high danger level in terms of relative numbers of incidents, and light green means that a state is relatively safe.
Now, Alaska, Louisiana and Delaware are showing the highest relative incident numbers. Hawaii seems the safest state to live in , and the large state of California drops from second state in terms of absolute incidents to a lower position when corrected for the large population.
install.packages("usmap")
Error in install.packages : Updating loaded packages
#load package
library(usmap)
#Graph gun-related incidents in the U.S map
plot_usmap(data = crimedensity,values="per_million",color="blue") +
labs(title = "Gun-related incidents in the U.S",
subtitle = "Source: GVA, 2013-2018") + scale_fill_continuous(low="white",high="darkred",name="incident_density",label=scales::comma)

theme(legend.position = "right")
List of 1
$ legend.position: chr "right"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
To make the visualization much clearer, I compute a map so that we can see that per million inhabitants, states at south-east part of the U.S. tend to have more incidents.
Victims by State
#add new variable: victims
Gun_violence_new$victims <- Gun_violence_new$n_killed + Gun_violence_new$n_injured
Victims1 <-
Gun_violence_new %>%
group_by(state) %>%
summarize(sumVic=sum(victims), sumInj=sum(n_injured), sumDeath=sum(n_killed), PercDeath=round(sumDeath/sumVic,2), sumIncidents=n(), vicPerInc=round(sumVic/sumIncidents,2))
`summarise()` ungrouping output (override with `.groups` argument)
victims<-
Victims1 %>%
left_join(State_ZipGeography, by = c("state"= "State"))
head(victims)
As you can see in the table above, Alaska actually has a relatively low number of victims per incident (0.44). However, as shown in the density graph, Alaska has the highest density.
Graph of relationship between state populatio and victim per incident
# Scatter plot with regression line
victims %>%
ggplot(aes(x=population, y=vicPerInc)) + geom_point(stat='identity',color="darkred") +
labs(x='population', y='Victims per incidents') +geom_smooth(method=lm)+theme_linedraw()

In the graph above, which uses the scatter plot with regression line, you can see that there is positive relationship between the number of victims relative to the population size. This is no surprise, as the state has higher population tend to have more incidents and ranks high regarding the victims per incident ratio.
Type: Comparing number of incidents by characteristics.
At last, since the ‘incident_characteristics` field actually holds a lot in information. I choose to get an general idea about the characteristic of incidents that happen most in the U.S. And compare some popular cities.
Gun_violence_new$incident_characteristics <- gsub("\\|\\|", "|", Gun_violence_new$incident_characteristics)
#download package
install.packages("splitstackshape")
Error in install.packages : Updating loaded packages
install.packages("gridExtra")
Error in install.packages : Updating loaded packages
library(splitstackshape)
library(gridExtra )
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
Incident types ranking for the U.S.
#download package
install.packages("splitstackshape")
Error in install.packages : Updating loaded packages
library(splitstackshape)
library(gridExtra )```
Error: attempt to use zero-length variable name
Incident_type%>%
count(incident_characteristics) %>%
top_n(30, wt=n) %>%
ggplot(aes(x=reorder(incident_characteristics, n), y=n)) +
geom_bar(stat='identity', fill='orange') +coord_flip() + labs(x='Incident Category', y='number of incidents')+theme_linedraw()

As you can see, there are lots of different categories. These do not necessarily all involve shots being fired. I for instance assume that ‘Non-Shooting Incident’ means that people have just threatened to shoot, or possible have used their gun as a striking weapon. And most incidents are described as Shot - Wounded/Injured.
As the first 4 categories seem overall categories, I will display numbers on these 4 categories separately in the remainder of next section.
graph of main incident categories by popular city
overall_categories <- c("Shot - Wounded/Injured", "Shot - Dead (murder, accidental, suicide)", "Non-Shooting Incident", "Shots Fired - No Injuries")
coloursShot <- c("Shot - Wounded/Injured"="red", "Shot - Dead (murder, accidental, suicide)"="darkred", "Non-Shooting Incident"="green", "Shots Fired - No Injuries"="yellow")
uscategories <- function(fixedX=0.5){
Incident_type %>%
filter(incident_characteristics %in% overall_categories) %>%
count(incident_characteristics) %>%
ggplot(aes(x=reorder(incident_characteristics, n), y=n/sum(n), fill=factor(incident_characteristics))) +
geom_bar(stat='identity', width = 0.5) + scale_fill_manual(values = coloursShot) +
theme(legend.position="none") + coord_flip(ylim = c(0, fixedX)) + labs(x="", y='US overall') +
scale_y_continuous(labels=percent)
}
#
citycategories <- function(cityName){
Incident_type %>%
filter(city_or_county==cityName & incident_characteristics %in% overall_categories) %>%
count(incident_characteristics) %>%
ggplot(aes(x=reorder(incident_characteristics, n), y=n/sum(n), fill=factor(incident_characteristics))) +
geom_bar(stat='identity', width = 0.5) + scale_fill_manual(values = coloursShot) +
theme(legend.position="none") + coord_flip(ylim = c(0, 0.8)) + labs(x="", y=cityName) +
scale_y_continuous(labels=percent)
}
usOverall_categories <- uscategories(0.8)
baltimore_categories <- citycategories('Baltimore')
washington_categories <- citycategories('Washington')
chicago_categories <- citycategories('Chicago')
orlando_categories <- citycategories('Orlando')
grid.arrange(usOverall_categories, baltimore_categories, washington_categories, chicago_categories,orlando_categories, ncol=1)

Base on the graph above, we can see that except Washington, Shot - Dead (murder, accidental, suicide)" is always more than "Non-Shooting Incident in many popular cities
In Baltimore the percentage of incidents with people wounded is high (20% higher than US average). In addition, there are very few incidents with shots fired and people not injured.
And in Chicago, the percentage of incidents with wounded people is very high. On the other hand, the percentage of incidents with deadly victims is a bit lower than the US average.